*****************************************************************************************************
* Purpose: Clean diabetes and mortality data for analysis
* Written by: Hunter Green and David Flood
* Last updated: 2024-12-28
* Stata version: 18.0
*****************************************************************************************************

* Machine toggle: "T" selects David's folder paths, any other value selects the paths in the else branch
global David "F"


*****************************************************************************************************
* Options, global macros, log
*****************************************************************************************************
* Session options: pin the interpreter version, empty the session, run without prompts,
* require full variable names, and enable the pause command
version 18.0
clear all
pause on
set varabbrev off
set more off

* Global folder macros
* NOTE(review): both branches currently hold identical paths, and ${haalsi} resolves to the
*               Code folder rather than a data folder -- confirm these are the intended locations.
if "${David}" == "T" {
	* David: add your folder paths here
	* HAALSI
	global haalsi "/Users/dcflood/Library/CloudStorage/Dropbox-UniversityofMichigan/David Flood/HPACC/Aging projects/HRS diabetes mortality/Code"
	* paper
	global paper_data "/Users/dcflood/Library/CloudStorage/Dropbox-UniversityofMichigan/David Flood/HPACC/Aging projects/HRS diabetes mortality/Data"
	global paper_code "/Users/dcflood/Library/CloudStorage/Dropbox-UniversityofMichigan/David Flood/HPACC/Aging projects/HRS diabetes mortality/Code"
}
else {
	* HAALSI
	global haalsi "/Users/dcflood/Library/CloudStorage/Dropbox-UniversityofMichigan/David Flood/HPACC/Aging projects/HRS diabetes mortality/Code"
	* paper
	global paper_data "/Users/dcflood/Library/CloudStorage/Dropbox-UniversityofMichigan/David Flood/HPACC/Aging projects/HRS diabetes mortality/Data"
	global paper_code "/Users/dcflood/Library/CloudStorage/Dropbox-UniversityofMichigan/David Flood/HPACC/Aging projects/HRS diabetes mortality/Code"
}

* Start a fresh log in the code folder (any log still open is closed first)
capture log close
log using "${paper_code}/01_diab-mort-clean.log", replace


*****************************************************************************************************
* Run mortality data do files
*****************************************************************************************************
* Run the four study-specific mortality do-files, in order.
* The calls are anchored to ${paper_code} so they do not depend on Stata's current working
* directory (this script never issues -cd-); Stata appends the .do extension automatically.
* NOTE(review): assumes the four do-files live in ${paper_code}, the same folder the log is written to.
do "${paper_code}/01_01_diab-mort-hrs"
do "${paper_code}/01_02_diab-mort-mhas"
do "${paper_code}/01_03_diab-mort-charls"
do "${paper_code}/01_04_diab-mort-haalsi"


*****************************************************************************************************
* Pre-process data
*****************************************************************************************************
* RAND HRS data: identifiers, design variables, and the wave 10/11 covariates used below
use hhidpn hhid pn raehsamp raestrat ///
    r10agey_e r11agey_e ragender ///
    r10diabe r11diabe r10pmbmi r11pmbmi ///
    r10smoken r11smoken h10itot h11itot ///
    using "${paper_data}/HRS/randhrs1992_2020v2.dta", clear

* Harmonized HRS data (-nogenerate- means no _merge variable is created)
merge 1:1 hhidpn using "${paper_data}/HRS/H_HRS_d.dta", nogenerate ///
    keepusing(raeducl h10rural h11rural r10rxdiab r11rxdiab r10mbmi r11mbmi)

* The biomarker files are merged on upper-case HHID and PN
rename (hhid pn) (HHID PN)

* HRS 2010 biomarker data
merge 1:1 HHID PN using "${paper_data}/HRS/BIOMK10BL_R.dta", nogenerate keepusing(MA1C_ADJ MBIOWGTR)
rename (MA1C_ADJ MBIOWGTR) (ma1c_adj mbiowgtr)

* HRS 2012 biomarker data; the HHID/PN keys are not needed after this merge
merge 1:1 HHID PN using "${paper_data}/HRS/BIOMK12BL_R.dta", nogenerate keepusing(NA1C_ADJ NBIOWGTR)
drop HHID PN
rename (NA1C_ADJ NBIOWGTR) (na1c_adj nbiowgtr)

* HRS dates data
merge 1:1 hhidpn using "${paper_data}/hrs_dates.dta", nogenerate ///
    keepusing(mpmelig npmelig w10_ym w11_ym biomarker_ym true_deceased ///
              true_death_ym study_deceased study_death_ym censor_ym)

* Eligibility flags: 1 when mpmelig (2010) / npmelig (2012) is coded 1, 5, 6, or 7; 0 otherwise
generate elig10 = inlist(mpmelig,1,5,6,7)
generate elig12 = inlist(npmelig,1,5,6,7)
drop mpmelig npmelig

* Keep respondents flagged for either wave
keep if elig10 == 1 | elig12 == 1

* Prefix every HRS variable with "h_"
rename * h_*

* Park the HRS extract in a temporary file and empty memory
tempfile hrs_merge
save "`hrs_merge'"
clear

* Harmonized MHAS data
use unhhidnp cunicah np r3agey ragender raeducl h3rural ///
    r3diabe r3rxdiab r3smoken r3mbmi hh3ctot1m h3hhres ///
    using "${paper_data}/MHAS/H_MHAS_c2.dta", clear

* MHAS biomarker data (-nogenerate- means no _merge variable is created)
merge 1:1 cunicah np using "${paper_data}/MHAS/sect_l_biomarkers_2012.dta", nogenerate ///
    keepusing(factora_12 reshg2_12)

* MHAS dates data
merge 1:1 cunicah np using "${paper_data}/mhas_dates.dta", nogenerate ///
    keepusing(in2012 iw2012_ym biomarker_ym true_deceased true_death_ym ///
              study_deceased study_death_ym censor_ym)

* Restrict to the 2012 interview (Wave 3), then discard the merge keys and the filter flag
keep if in2012 == 1
drop cunicah np in2012

* Prefix every MHAS variable with "m_"
rename * m_*

* Park the MHAS extract in a temporary file and empty memory
tempfile mhas_merge
save "`mhas_merge'"
clear

* Harmonized CHARLS data
use ID ID_w1 r1agey ragender raeduc_c h1rural r1diabe r1rxdiab r1smoken r1mbmi hh1cperc ///
    using "${paper_data}/CHARLS/Harmonized/H_CHARLS_D_Data.dta", clear

* The blood file is merged on ID, so set the harmonized ID aside and move the wave 1 ID into ID
rename ID ID_w234
rename ID_w1 ID
* Rows without a wave 1 ID get a row-number placeholder
* (presumably so the 1:1 merge key stays unique -- confirm)
replace ID = "_" + string(_n) if missing(ID)

* CHARLS biomarker data (-nogenerate- means no _merge variable is created)
merge 1:1 ID using "${paper_data}/CHARLS/2011/Blood_20140429.dta", nogenerate ///
    keepusing(bloodweight newglu newhba1c qc1_va003)

* Discard the wave 1 key and restore the harmonized ID under the name ID
drop ID
rename ID_w234 ID

* CHARLS dates data
merge 1:1 ID using "${paper_data}/charls_dates.dta", nogenerate ///
    keepusing(in2011 iw2011_ym biomarker_ym true_deceased true_death_ym ///
              study_deceased study_death_ym censor_ym)

* Restrict to the 2011 interview (Wave 1)
keep if in2011 == 1
drop in2011

* Prefix every CHARLS variable with "c_"
rename * c_*

* Park the CHARLS extract in a temporary file and empty memory
tempfile charls_merge
save "`charls_merge'"
clear

* HAALSI data (w1cm011s? is a wildcard: every variable named w1cm011s plus one character)
use prim_key w1bd036 w1c_bd_educ4 w1c_bs_bmi w1c_bs_fasting8hr w1c_bs_glucose ///
    w1c_hh_totconpc w1c_rage_calc w2c_rsex ///
    w1cm007_females w1cm007_males w1cm011s? w1cm059 w1cm061 ///
    using "${haalsi}/HAALSI W3 Longitudinal Data MAY_18_2023.dta/HAALSI W3 Longitudinal Data MAY_18_2023.dta", clear

* HAALSI HIV data (-nogenerate- means no _merge variable is created)
merge 1:1 prim_key using "${haalsi}/HAALSI_W1_HIVcut.dta", nogenerate keepusing(hivfinalresult)

* HAALSI dates data
merge 1:1 prim_key using "${paper_data}/haalsi_dates.dta", nogenerate ///
    keepusing(w1_ym biomarker_ym true_deceased true_death_ym ///
              study_deceased study_death_ym censor_ym)

* Prefix every HAALSI variable with "z_"
rename * z_*

* Park the HAALSI extract in a temporary file and empty memory
tempfile haalsi_merge
save "`haalsi_merge'"
clear


*****************************************************************************************************
* Append data
*****************************************************************************************************
* Stack the four study extracts onto the empty dataset; -append- records the source file
* (1 = first file listed ... 4 = last file listed) in the new variable "append"
append using "`hrs_merge'" "`mhas_merge'" "`charls_merge'" "`haalsi_merge'", generate(append)


*****************************************************************************************************
* Assign value labels
*****************************************************************************************************
label define studyid ///
    1 "1.HRS" ///
    2 "2.MHAS" ///
    3 "3.CHARLS" ///
    4 "4.HAALSI"
label define age3 ///
    1 "1.51-59" ///
    2 "2.60-69" ///
    3 "3.70+"
label define age11 ///
    1 "1.51-54"  2 "2.55-59"  3 "3.60-64"  4 "4.65-69" ///
    5 "5.70-74"  6 "6.75-79"  7 "7.80-84"  8 "8.85-89" ///
    9 "9.90-94" 10 "10.95-99" 11 "11.100+"
label define sex ///
    1 "1.male" ///
    2 "2.female"
label define yesno ///
    0 "0.no" ///
    1 "1.yes"
label define diab_3cat ///
    0 "0.no diabetes" ///
    1 "1.undiagnosed diabetes" ///
    2 "2.diagnosed diabetes"
label define cendied ///
    0 "0.censored" ///
    1 "1.died"
label define bmi4 ///
    1 "1.<18.5 (underweight)" ///
    2 "2.18.5-24.9 (normal weight)" ///
    3 "3.25.0-29.9 (overweight)" ///
    4 "4.30+ (obese)"
label define tertile ///
    1 "1.low" ///
    2 "2.middle" ///
    3 "3.high"
label define educ3 ///
    1 "1.lt upper secondary" ///
    2 "2.upper secondary and vocational" ///
    3 "3.tertiary"
label define educ2 ///
    1 "1.lt upper secondary" ///
    2 "2.upper secondary and greater"


*****************************************************************************************************
* Clean data
*****************************************************************************************************
** Study name (the source indicator created by -append- becomes the study identifier)
rename append study
label variable study "study: study name"
label values study studyid

** ID variables: one ID per study, named <prefix>_id
rename (h_hhidpn m_unhhidnp c_ID z_prim_key) (h_id m_id c_id z_id)
label variable h_id "h_id: HRS ID (hhidpn)"
label variable m_id "m_id: MHAS ID (unhhidnp)"
label variable c_id "c_id: CHARLS ID (ID)"
label variable z_id "z_id: HAALSI ID (prim_key)"

** Weight variables
* HRS: 2010 biomarker weight for 2010-eligible rows; the 2012 weight is applied second,
*      so it takes precedence for rows flagged in both waves
generate h_weight = .
replace h_weight = h_mbiowgtr if h_elig10 == 1 & !missing(h_mbiowgtr)
replace h_weight = h_nbiowgtr if h_elig12 == 1 & !missing(h_nbiowgtr)
label variable h_weight "h_weight: HRS weight"
drop h_mbiowgtr h_nbiowgtr

* MHAS and CHARLS: the study-provided weight is used as is
rename (m_factora_12 c_bloodweight) (m_weight c_weight)
label variable m_weight "m_weight: MHAS weight"
label variable c_weight "c_weight: CHARLS weight"

* Total weight: the study-specific weight, with every HAALSI row weighted 1
generate total_weight = .
replace total_weight = h_weight if study == 1
replace total_weight = m_weight if study == 2
replace total_weight = c_weight if study == 3
replace total_weight = 1 if study == 4
label variable total_weight "total_weight: total weight"

** Design variables
* HRS
rename (h_raehsamp h_raestrat) (h_cluster h_strata)
label variable h_cluster "h_cluster: HRS cluster (raehsamp)"
label variable h_strata "h_strata: HRS strata (raestrat)"

** Age at biomarker collection (years)
* HRS takes the wave 10 age for 2010-eligible rows; the wave 11 age is applied second,
* so it takes precedence for rows flagged in both waves
generate agey = .
replace agey = h_r10agey_e if study == 1 & h_elig10 == 1 & !missing(h_r10agey_e)
replace agey = h_r11agey_e if study == 1 & h_elig12 == 1 & !missing(h_r11agey_e)
replace agey = m_r3agey if study == 2 & !missing(m_r3agey)
replace agey = c_r1agey if study == 3 & !missing(c_r1agey)
replace agey = z_w1c_rage_calc if study == 4 & !missing(z_w1c_rage_calc)
label variable agey "agey: age at biomarker collection (years)"
drop h_r10agey_e h_r11agey_e m_r3agey c_r1agey z_w1c_rage_calc

** Age at biomarker collection (3 categories); ages below 51 and missing ages stay missing
generate agecat = .
replace agecat = 1 if inrange(agey,51,59)
replace agecat = 2 if inrange(agey,60,69)
replace agecat = 3 if inrange(agey,70,200)
label variable agecat "agecat: age category"
label values agecat age3

* Age standardization - age groups
* Band k covers [k-th entry of band_lo, k-th entry of band_hi]
local band_lo 51 55 60 65 70 75 80 85 90 95 100
local band_hi 54 59 64 69 74 79 84 89 94 99 200
generate agevar = .
forvalues band = 1/11 {
    local lo : word `band' of `band_lo'
    local hi : word `band' of `band_hi'
    replace agevar = `band' if inrange(agey,`lo',`hi')
}
label variable agevar "agevar: age category for standardization"
label values agevar age11

* Age standardization - population standard weights, one per age band
* Ahmad OB et al. Age Standardization of Rates: A New WHO Standard. World Health Organization. 2001.
local band_wt 0.2454857 0.2080000 0.1700571 0.1353143 0.1010286 0.0694857 ///
              0.0416000 0.0201143 0.0068571 0.0018286 0.0002286
generate ageweightvar = .
forvalues band = 1/11 {
    local wt : word `band' of `band_wt'
    replace ageweightvar = `wt' if agevar == `band'
}
label variable ageweightvar "ageweightvar: population standard weights for standardization"

** Gender (1 = male, 2 = female), taken from each study's own gender/sex variable
gen gender =.
forvalues g = 1/2 {
    replace gender = `g' if (h_ragender == `g' & study == 1) ///
                          | (m_ragender == `g' & study == 2) ///
                          | (c_ragender == `g' & study == 3) ///
                          | (z_w2c_rsex == `g' & study == 4)
}
label variable gender "gender: gender"
label values gender sex
drop h_ragender m_ragender c_ragender z_w2c_rsex

** Education (3 categories)
* Each study's coding is collapsed to 1 = lt upper secondary, 2 = upper secondary and vocational,
* 3 = tertiary. The HAALSI conditions carry an explicit study == 4 guard, matching the other
* three studies, so the mapping does not rely on z_w1bd036 being missing outside HAALSI.
gen educ =.
replace educ = 1 if (h_raeducl == 1 & study == 1) ///
                  | (m_raeducl == 1 & study == 2) ///
                  | (inrange(c_raeduc_c,1,5) & study == 3) ///
                  | ((inrange(z_w1bd036,1,11) | inrange(z_w1bd036,17,20)) & study == 4)
replace educ = 2 if (h_raeducl == 2 & study == 1) ///
                  | (m_raeducl == 2 & study == 2) ///
                  | (inrange(c_raeduc_c,6,7) & study == 3) ///
                  | (inrange(z_w1bd036,12,14) & study == 4)
replace educ = 3 if (h_raeducl == 3 & study == 1) ///
                  | (m_raeducl == 3 & study == 2) ///
                  | (inrange(c_raeduc_c,8,10) & study == 3) ///
                  | (inlist(z_w1bd036,15,16,21,22) & study == 4)
label variable educ "educ: education (3 categories)"
label values educ educ3
drop h_raeducl m_raeducl c_raeduc_c z_w1bd036

** Education (2 categories): categories 2 and 3 of educ are combined
gen educ2 =.
replace educ2 = 1 if educ == 1
replace educ2 = 2 if inlist(educ,2,3)
label variable educ2 "educ2: education (2 categories)"
label values educ2 educ2

** Household economic status
* The measure and currency differ by study (see the per-line notes); it is only used within study.
* For HRS the wave 11 value is applied second, so it takes precedence for rows flagged in both waves.
gen hh_econ =.
replace hh_econ = h_h10itot if study == 1 & !mi(h_h10itot) & h_elig10 == 1  //annual total income (r+s), nominal dollars
replace hh_econ = h_h11itot if study == 1 & !mi(h_h11itot) & h_elig12 == 1  //annual total income (r+s), nominal dollars
replace hh_econ = ((m_hh3ctot1m * 12) / m_h3hhres) if study == 2 & !mi(m_hh3ctot1m) & !mi(m_h3hhres)  //monthly household consumption converted to annual per-capita, nominal pesos
replace hh_econ = c_hh1cperc if study == 3 & !mi(c_hh1cperc)  //annual household per-capita consumption, nominal yuan
replace hh_econ = (z_w1c_hh_totconpc * 12) if study == 4 & !mi(z_w1c_hh_totconpc) //monthly household per-capita consumption converted to annual, nominal rands
* Variable label added for consistency with the other derived variables
label variable hh_econ "hh_econ: household economic status (study-specific measure)"
drop h_h10itot h_h11itot m_hh3ctot1m m_h3hhres c_hh1cperc z_w1c_hh_totconpc

** Smoking status at biomarker collection
* HAALSI: 0 when either w1cm059 or w1cm061 is coded 2; 1 when w1cm061 is coded 1 (applied last)
gen smoker =.
replace smoker = h_r10smoken if study == 1 & !mi(h_r10smoken) & h_elig10 == 1
replace smoker = h_r11smoken if study == 1 & !mi(h_r11smoken) & h_elig12 == 1
replace smoker = m_r3smoken if study == 2 & !mi(m_r3smoken)
replace smoker = c_r1smoken if study == 3 & !mi(c_r1smoken)
replace smoker = 0 if study == 4 & (z_w1cm059 == 2 | z_w1cm061 == 2)
replace smoker = 1 if study == 4 & z_w1cm061 == 1
label variable smoker "smoker: smoking status at biomarker collection"
label values smoker yesno
drop h_r10smoken h_r11smoken m_r3smoken c_r1smoken z_w1cm059 z_w1cm061

** Rural/urban
* Every HAALSI row is coded 1
gen rural =.
replace rural = h_h10rural if study == 1 & !mi(h_h10rural) & h_elig10 == 1
replace rural = h_h11rural if study == 1 & !mi(h_h11rural) & h_elig12 == 1
replace rural = m_h3rural if study == 2 & !mi(m_h3rural)
replace rural = c_h1rural if study == 3 & !mi(c_h1rural)
replace rural = 1 if study == 4
* Variable label added for consistency with the other derived variables
label variable rural "rural: rural/urban residence"
drop h_h10rural h_h11rural m_h3rural c_h1rural

** BMI at biomarker collection
* HRS uses the rNpmbmi variables (the rNmbmi variables are loaded but only dropped here);
* the wave 11 value is applied second, so it takes precedence for rows flagged in both waves.
* HAALSI values outside 9.59-158.15 are left missing.
generate mbmi = .
replace mbmi = h_r10pmbmi if study == 1 & h_elig10 == 1 & !missing(h_r10pmbmi)
replace mbmi = h_r11pmbmi if study == 1 & h_elig12 == 1 & !missing(h_r11pmbmi)
replace mbmi = m_r3mbmi if study == 2 & !missing(m_r3mbmi)
replace mbmi = c_r1mbmi if study == 3 & !missing(c_r1mbmi)
replace mbmi = z_w1c_bs_bmi if study == 4 & inrange(z_w1c_bs_bmi,9.59,158.15)
label variable mbmi "mbmi: measured BMI at biomarker collection"
drop h_r10pmbmi h_r11pmbmi h_r10mbmi h_r11mbmi m_r3mbmi c_r1mbmi z_w1c_bs_bmi

** Categorical bmi variable
* Category k covers [k-th entry of bmi_lo, k-th entry of bmi_hi]
local bmi_lo 0 18.5 25 30
local bmi_hi 18.499999 24.999999 29.999999 3000
generate bmicat = .
forvalues k = 1/4 {
    local lo : word `k' of `bmi_lo'
    local hi : word `k' of `bmi_hi'
    replace bmicat = `k' if inrange(mbmi,`lo',`hi')
}
label variable bmicat "bmicat: measured BMI category"
label values bmicat bmi4

** Diabetes diagnosis at biomarker collection
* HRS: the wave 11 value is applied second, so it takes precedence for rows flagged in both waves.
* HAALSI: the question is stored separately for females and males; code 2 -> 0, code 1 -> 1 (applied last).
generate diagdiab = .
replace diagdiab = h_r10diabe if study == 1 & h_elig10 == 1 & !missing(h_r10diabe)
replace diagdiab = h_r11diabe if study == 1 & h_elig12 == 1 & !missing(h_r11diabe)
replace diagdiab = m_r3diabe if study == 2 & !missing(m_r3diabe)
replace diagdiab = c_r1diabe if study == 3 & !missing(c_r1diabe)
replace diagdiab = 0 if study == 4 & inlist(2, z_w1cm007_females, z_w1cm007_males)
replace diagdiab = 1 if study == 4 & inlist(1, z_w1cm007_females, z_w1cm007_males)
label variable diagdiab  "diagdiab: diagnosed diabetes"
label values diagdiab yesno
drop h_r10diabe h_r11diabe m_r3diabe c_r1diabe z_w1cm007_females z_w1cm007_males

** Diabetes medication at biomarker collection
* HAALSI: 0 when not diagnosed or when item s1, s2, s5, or s6 is selected;
*         1 when item s3 or s4 is selected (applied last, so it overrides the 0)
generate rxdiab = .
replace rxdiab = h_r10rxdiab if study == 1 & h_elig10 == 1 & !missing(h_r10rxdiab)
replace rxdiab = h_r11rxdiab if study == 1 & h_elig12 == 1 & !missing(h_r11rxdiab)
replace rxdiab = m_r3rxdiab if study == 2 & !missing(m_r3rxdiab)
replace rxdiab = c_r1rxdiab if study == 3 & !missing(c_r1rxdiab)
replace rxdiab = 0 if study == 4 & ( diagdiab == 0    ///
                                   | z_w1cm011s1 == 1 ///
                                   | z_w1cm011s2 == 2 ///
                                   | z_w1cm011s5 == 5 ///
                                   | z_w1cm011s6 == 6 )
replace rxdiab = 1 if study == 4 & (z_w1cm011s3 == 3 | z_w1cm011s4 == 4)
label variable rxdiab  "rxdiab: diagnosed medication (oral or insulin)"
label values rxdiab yesno
drop h_r10rxdiab h_r11rxdiab m_r3rxdiab c_r1rxdiab ///
     z_w1cm011s1 z_w1cm011s2 z_w1cm011s3 z_w1cm011s4 z_w1cm011s5 z_w1cm011s6

** Diabetes medication among diagnosed: rxdiab copied for diagnosed rows, missing for everyone else
generate rxdiab_among_diag = .
replace rxdiab_among_diag = rxdiab if diagdiab == 1 & inlist(rxdiab,0,1)
label variable rxdiab_among_diag "rxdiab_among_diag: diabetes medication among diagnosed"
label values rxdiab_among_diag yesno

** HIV
* Every HRS, MHAS, and CHARLS row is coded 0; HAALSI uses the test result (2 -> 0, 1 -> 1).
* The parentheses only make the existing &-before-| evaluation order explicit.
generate hiv = .
replace hiv = 0 if (study == 4 & z_hivfinalresult == 2) | inrange(study,1,3)
replace hiv = 1 if study == 4 & z_hivfinalresult == 1
label variable hiv "hiv: HIV positive"
label values hiv yesno
drop z_hivfinalresult

** HbA1c (filled for HRS, MHAS, and CHARLS only)
* HRS: the 2012 value is applied second, so it takes precedence for rows flagged in both waves.
* The HRS eligibility flags are not used after this point and are dropped here.
generate hba1c = .
replace hba1c = h_ma1c_adj if study == 1 & h_elig10 == 1 & !missing(h_ma1c_adj)
replace hba1c = h_na1c_adj if study == 1 & h_elig12 == 1 & !missing(h_na1c_adj)
replace hba1c = m_reshg2_12 if study == 2 & !missing(m_reshg2_12)
replace hba1c = c_newhba1c if study == 3 & !missing(c_newhba1c)
label variable hba1c "hba1c: HbA1c (%)"
drop h_ma1c_adj h_na1c_adj h_elig10 h_elig12 m_reshg2_12 c_newhba1c

** Glucose (filled for CHARLS and HAALSI only; the HAALSI value is multiplied by 18)
generate glucose = .
replace glucose = c_newglu if study == 3 & !missing(c_newglu)
replace glucose = (z_w1c_bs_glucose * 18) if study == 4 & !missing(z_w1c_bs_glucose)
label variable glucose "glucose: glucose (mg/dL)"
drop c_newglu z_w1c_bs_glucose

** Fasting status (CHARLS and HAALSI only): 0 by default, 1 when the study's fasting item is coded 1
generate fasting = .
replace fasting = 0 if inlist(study,3,4)
replace fasting = 1 if (study == 3 & c_qc1_va003 == 1) ///
                     | (study == 4 & z_w1c_bs_fasting8hr == 1)
label variable fasting "fasting: fasting status"
label values fasting yesno
drop c_qc1_va003 z_w1c_bs_fasting8hr

** Measured hba1c >= 6.5% (values above 30 stay missing)
generate a1c_ge_65 = .
replace a1c_ge_65 = 0 if inrange(hba1c,0,6.499999)
replace a1c_ge_65 = 1 if inrange(hba1c,6.5,30)
label variable a1c_ge_65 "a1c_ge_65: measured hba1c >= 6.5%"
label values a1c_ge_65 yesno

** Measured glucose >= 126 mg/dL (fasting)/ >= 200 mg/dL (non-fasting); values above 899 stay missing
generate glu_high = .
replace glu_high = 0 if (fasting == 1 & inrange(glucose,0,125.999999)) ///
                      | (fasting == 0 & inrange(glucose,0,199.999999))
replace glu_high = 1 if (fasting == 1 & inrange(glucose,126,899)) ///
                      | (fasting == 0 & inrange(glucose,200,899))
label variable glu_high "glu_high: measured glucose >= 126 mg/dL (fasting) OR >= 200 mg/dL (non-fasting)"
label values glu_high yesno

** Total diabetes, built twice with the same rule:
**   totdiab    = self-report diagnosis  + biomarker
**   totdiab_rx = self-report medication + biomarker
* The biomarker is a1c_ge_65 for studies 1-2 and glu_high for studies 3-4.
* A row is 0 only when both components are 0, and 1 when either component is 1
* provided both components are non-missing.
foreach selfreport in diagdiab rxdiab {
    if "`selfreport'" == "diagdiab" local total totdiab
    else                            local total totdiab_rx

    generate `total' = .
    replace `total' = 0 if `selfreport' == 0 & a1c_ge_65 == 0 & inlist(study,1,2)
    replace `total' = 0 if `selfreport' == 0 & glu_high == 0 & inlist(study,3,4)
    replace `total' = 1 if (`selfreport' == 1 | a1c_ge_65 == 1) ///
                         & !missing(`selfreport') & !missing(a1c_ge_65) & inlist(study,1,2)
    replace `total' = 1 if (`selfreport' == 1 | glu_high == 1) ///
                         & !missing(`selfreport') & !missing(glu_high) & inlist(study,3,4)
    label values `total' yesno
}
label variable totdiab "totdiab: total diabetes (self-report diagnosis + biomarker)"
label variable totdiab_rx "totdiab_rx: total diabetes (self-report medication + biomarker)"

** Diagnosed among all with diabetes: diagdiab copied for rows with totdiab == 1, missing otherwise
generate diag_among_diab = .
replace diag_among_diab = diagdiab if totdiab == 1 & inlist(diagdiab,0,1)
label variable diag_among_diab "diag_among_diab: diagnosed among all diabetes"
label values diag_among_diab yesno

** Diagnosed among all with diabetes (add no diabetes category)
* 0 = no diabetes, 1 = diabetes and undiagnosed, 2 = diabetes and diagnosed
generate diab3 = .
replace diab3 = 0 if totdiab == 0
replace diab3 = diagdiab + 1 if totdiab == 1 & inlist(diagdiab,0,1)
label variable diab3 "diab3: diagnosed among all diabetes"
label values diab3 diab_3cat

** Biomarker, death, and censor year-month
* Each pooled variable takes the value of the matching study-prefixed variable (h_/m_/c_/z_),
* is displayed as a monthly date, and the four study-prefixed source variables are then dropped.
foreach datevar in biomarker_ym study_death_ym censor_ym {
    gen `datevar' =.
    replace `datevar' = h_`datevar' if study == 1
    replace `datevar' = m_`datevar' if study == 2
    replace `datevar' = c_`datevar' if study == 3
    replace `datevar' = z_`datevar' if study == 4
    format `datevar' %tm
    drop h_`datevar' m_`datevar' c_`datevar' z_`datevar'
}
label variable biomarker_ym "biomarker_ym: year and month of biomarker collection"
label variable study_death_ym "study_death_ym: year and month of death"
label variable censor_ym "censor_ym: year and month of censor"

** Indicator for death/censor
* 0 when a censor date exists, overwritten with 1 when a death date exists;
* missing when there is no biomarker date
gen censordied =.
replace censordied = 0 if !mi(censor_ym) & !mi(biomarker_ym)
replace censordied = 1 if !mi(study_death_ym) & !mi(biomarker_ym)
label variable censordied "censordied: indicator of death/censor"
label values censordied cendied

* Time between biomarker collection and death/censor
* 	Months to death when a death date exists, otherwise months to censoring
* 	Set zero values (possible) equal to 0.5
* 	Set negative values (impossible) equal to missing
* 	The negative test is followtime_m < 0, so every negative value is cleared; the earlier
* 	inrange(followtime_m,-100,-1) test left values below -100 in place. Missing values
* 	compare as larger than any number in Stata, so they are not touched by this test.
gen followtime_m = study_death_ym - biomarker_ym
replace followtime_m = censor_ym - biomarker_ym if mi(followtime_m)
replace followtime_m = 0.5 if followtime_m == 0
replace followtime_m = . if followtime_m < 0
label variable followtime_m "followtime_m: months between biomarker collection and death/censor"

gen followtime_y = (followtime_m / 12)
label variable followtime_y "followtime_y: years between biomarker collection and death/censor"

** Indicator for non-missing sample
* 1 when every listed variable is non-missing and follow-up is at least 0.5 months, else 0
* (missing() with several arguments is true when any one of them is missing)
generate nonmiss = !missing(agecat, gender, educ, hh_econ, smoker, bmicat, totdiab, total_weight, hiv) ///
                   & inrange(followtime_m,0.5,999999)
label variable nonmiss "nonmiss: nonmissing sample"
label values nonmiss yesno

** Household economic status tertile
* Computed separately within each study, over that study's non-missing sample only.
* Prefix and study number run in parallel: h = 1 (HRS), m = 2 (MHAS), c = 3 (CHARLS), z = 4 (HAALSI).
local studynum 0
foreach prefix in h m c z {
    local ++studynum
    local studyname : word `studynum' of HRS MHAS CHARLS HAALSI

    xtile `prefix'_econtert = hh_econ if nonmiss == 1 & study == `studynum', nquantiles(3)
    label variable `prefix'_econtert "`prefix'_econtert: `studyname' household economic status tertile"
    label values `prefix'_econtert tertile
}


*****************************************************************************************************
* Save data, close log
*****************************************************************************************************
* Compress data (stores each variable in the smallest type that holds its values without loss)
compress

* Save the pooled, cleaned analysis file, overwriting any earlier version
save "${paper_data}/diab_mort_clean.dta", replace


*****************************************************************************************************
* Close log
*****************************************************************************************************
* Anything run after this point is not written to the log
log close

* Sample-flow diagnostics, one pass per study (h = 1 HRS, m = 2 MHAS, c = 3 CHARLS, z = 4 HAALSI).
* Each pass pauses, clears the Results window, then builds a temporary flag that starts at 1 for
* every row of the study and is set to 0 as each exclusion is applied in turn, tabulating the
* flag after every step. The final cross-tab compares the flag with nonmiss; the flag is then
* dropped, so the dataset is left unchanged.
local studynum 0
foreach prefix in h m c z {
    local ++studynum

    * Biomarker screened on: hba1c for HRS/MHAS, glucose for CHARLS/HAALSI
    if `studynum' <= 2 local biomarker hba1c
    else               local biomarker glucose

    * Exclusions applied after age and follow-up, in order; hiv is checked for HAALSI only
    local criteria `biomarker' diagdiab gender educ hh_econ smoker bmicat
    if `studynum' == 4 local criteria `criteria' hiv

    pause
    cls

    * total study sample
    gen sample_`prefix' = 1 if study == `studynum'
    tab sample_`prefix'

    * no age category (aged <51)
    replace sample_`prefix' = 0 if mi(agecat) & study == `studynum'
    tab sample_`prefix'

    * invalid followup
    replace sample_`prefix' = 0 if !inrange(followtime_m,0.5,999999) & study == `studynum'
    tab sample_`prefix'

    * missing biomarker, diabetes diagnosis, gender, education, economic status,
    * smoking status, bmi (and hiv for HAALSI)
    foreach criterion of local criteria {
        replace sample_`prefix' = 0 if mi(`criterion') & study == `studynum'
        tab sample_`prefix'
    }

    tab sample_`prefix' nonmiss if study == `studynum'
    drop sample_`prefix'
}

